Clean the orginal weight dataset

original = tibble(
  read.csv("./dataset/Student_Weight_Status_Category_Reporting_Results__Beginning_2010.csv")
) %>%
  janitor::clean_names() %>%
  filter(year_reported %in% c("2010-2011", "2018-2019")) %>% #only need data in year 2010-2011 or 2018-2019
  select(-location_code, -region, -area_name)  # the only location information we need is county name

Clean dataset with geolocation information

#import coordinates data set 
coordinates = tibble(
  read.csv("./dataset/Geocodes_USA_with_Counties.csv")
) %>%
  filter(state == "NY") %>%  # filter out counties outside NY state
  select(county, latitude, longitude) %>% # only information we need is county name and geolocation
  drop_na() %>%
  group_by(county) %>%
  summarise(latitude = mean(latitude), longitude = mean(longitude)) %>% #different location in each county variaed slightly, so we take the mean of each county's geolocation
  filter(!county == "") %>% # one county's name input is blank
  mutate(county = toupper(county)) # to swith county name to uppercase

combine two data set

weight_df = left_join(original, coordinates, by = "county")

average_percent = 
  weight_df %>% 
  group_by(county) %>% 
  drop_na(percent_overweight_or_obese) %>%
  summarize(average = mean(percent_overweight_or_obese), .groups = "keep")

sum_df = left_join(weight_df, average_percent, by = "county")
sum_df
## # A tibble: 6,237 x 15
##    county year_reported number_overweig… percent_overwei… number_obese
##    <chr>  <chr>                    <int>            <dbl>        <int>
##  1 ALBANY 2010-2011                  184             13.9          354
##  2 ALBANY 2010-2011                   83             12.6          201
##  3 ALBANY 2010-2011                  101             15.2          153
##  4 ALBANY 2010-2011                   94             19.5           67
##  5 ALBANY 2010-2011                   47             20.8           34
##  6 ALBANY 2010-2011                   47             18.4           33
##  7 ALBANY 2010-2011                  216             25.6          184
##  8 ALBANY 2010-2011                   63             17.9           98
##  9 ALBANY 2010-2011                  153             31.1           86
## 10 ALBANY 2010-2011                   39             13.8           56
## # … with 6,227 more rows, and 10 more variables: percent_obese <dbl>,
## #   number_overweight_or_obese <int>, percent_overweight_or_obese <dbl>,
## #   grade_level <chr>, number_healthy_weight <int>,
## #   percent_healthy_weight <dbl>, sex <chr>, latitude <dbl>, longitude <dbl>,
## #   average <dbl>
pal <- colorNumeric(
  palette = "viridis",
  domain = sum_df$average)

sum_df %>% 
  mutate(
    click_label = 
      str_c("<b>year-window:", year_reported, "</b><br>", average, "percent is overweight or obses<br>", grade_level, " grade")) %>% 
  leaflet() %>% 
  addProviderTiles(providers$CartoDB.Positron) %>% 
  addCircleMarkers(~longitude, ~latitude, radius = 5, color = ~pal(average), popup = ~click_label)
weight_df %>%
  mutate(
    text_label = str_c("Year-window: ", year_reported, "\nSchool grade: ", grade_level,"\n Obisity percentage: ", percent_obese)) %>%
  plot_ly(x = ~longitude, y = ~latitude, type = "scatter", mode = "markers",
          color = ~percent_obese, text = ~text_label, alpha = 0.5)